WSDM - Fake News Classification
https://www.kaggle.com/c/fake-news-pair-classification-challenge
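Task: given a pair of news titles (provided in both English and Chinese), predict whether the pair is unrelated, agreed, or disagreed. This notebook cleans the titles, augments the training pairs using agree/disagree consistency between title ids, and trains LSTM / MLP / TextCNN and Chinese-BERT classifiers with 5-fold cross validation, averaging the folds at test time.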
In [0]:
# Imports here
!pip install torch
!pip install torchvision
import torch,torchvision
#!pip install -I pillow
import numpy as np
import torchvision.transforms as transforms
from torch.autograd import Variable
#!pip install Pillow==4.0.0
#!pip install PIL
#!pip install image
#import PIL
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import re
from nltk.corpus import stopwords
import pickle
import nltk
from collections import defaultdict
import copy
from collections import Counter
from tqdm import tqdm as tqdm
import pandas as pd
import numpy as np
from torch.optim.lr_scheduler import CosineAnnealingLR
data_dir = '/drive/My Drive/Study/fakenews/'
isPreprocess = True
from google.colab import drive
drive.mount('/drive/')
In [0]:
In [0]:
import os
print(os.listdir("."))
print(os.listdir("/drive/My Drive"))
#os.chdir("drive/Colab/")
#print(os.listdir("."))
PREPROCESSING
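The cell below cleans the English and Chinese titles, maps the labels to integers (unrelated=0, agreed=1, disagreed=2), builds word-to-index dictionaries over the train and test titles, and pickles everything under save/. make_new_data additionally reconciles conflicting labels for duplicated id pairs and infers extra agreed/disagreed pairs from the id graph.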
In [0]:
'''
#nltk.download('stopwords')
stopwords_en = stopwords.words('english')
stopwords_en.remove("not")
stopwords_en.remove("no")
stopwords_en.remove("nor")
'''
stopwords_en = ['the', 'a', 'an']
def make_new_data(df):
title1_en = list(df["title1_en"])
title2_en = list(df["title2_en"])
title1_zh = list(df["title1_zh"])
title2_zh = list(df["title2_zh"])
labels = list(df["label"])
id1_train = list(df["tid1"])
id2_train = list(df["tid2"])
# id-text dictionary
id_to_text_en = defaultdict(list)
id_to_text_zh = defaultdict(list)
for idx, id1 in enumerate(id1_train):
#if not id1 in id_to_text_en.keys():
id_to_text_en[id1] = title1_en[idx]
id_to_text_zh[id1] = title1_zh[idx]
for idx, id2 in enumerate(id2_train):
#if not id2 in id_to_text_en.keys():
id_to_text_en[id2] = title2_en[idx]
id_to_text_zh[id2] = title2_zh[idx]
# key : id,
# value : id of agreed text or disagreed text.
agree_dic = defaultdict(list)
disagree_dic = defaultdict(list)
given_dic = defaultdict(list)
bidirection_dic = defaultdict(list)
fixed_dic = defaultdict(list)
for idx, id1 in enumerate(id1_train):
label = labels[idx]
id2 = id2_train[idx]
given_dic[id1].append((id2, label))
# for idx, id1 in enumerate(id1_train):
# label = labels[idx]
# id2 = id2_train[idx]
# given_dic[id1].append((id2, label))
for idx, id1 in enumerate(id1_train):
label = labels[idx]
id2 = id2_train[idx]
if not len(fixed_dic[id1]) == 0:
already_given_id = np.array(fixed_dic[id1])[:,0]
already_given_label = np.array(fixed_dic[id1])[:,1]
if not id2 in already_given_id:
fixed_dic[id1].append([id2, label])
else:
id2_idx = list(already_given_id).index(id2)
already_given = already_given_label[id2_idx]
if not label == already_given:
#print(id1, id2, already_given, label)
if label == 0:
pass
elif label == 1 and already_given == 0:
true_label = 1
fixed_dic[id1][id2_idx][1] = true_label
elif label == 2 or already_given == 2:
true_label = 2
fixed_dic[id1][id2_idx][1] = true_label
#print(id1, given_dic[id1][id2_idx])
else:
pass
else:
# first time this id is registered
fixed_dic[id1].append([id2, label])
if not len(fixed_dic[id2]) == 0:
already_given_id = np.array(fixed_dic[id2])[:,0]
already_given_label = np.array(fixed_dic[id2])[:,1]
if not id1 in already_given_id:
fixed_dic[id2].append([id1, label])
else:
id1_idx = list(already_given_id).index(id1)
already_given = already_given_label[id1_idx]
if not label == already_given:
if label == 0:
pass
elif label == 1 and already_given == 0:
true_label = 1
fixed_dic[id2][id1_idx][1] = true_label
elif label == 2 or already_given == 2:
true_label = 2
fixed_dic[id2][id1_idx][1] = true_label
else:
fixed_dic[id2].append([id1, label])
print("agree dic:{}, disagree dic:{}".format(len(agree_dic), len(disagree_dic)))
fixed_dic_cleaned = copy.deepcopy(fixed_dic)
print("deleting dublicates")
for id_, id_label_list in tqdm(fixed_dic_cleaned.items()):
#print(id_label_list)
if len(id_label_list) == 0:
continue
id_list = np.array(id_label_list)[:,0]
for eachid in id_list:
id_label_list2 = fixed_dic_cleaned[eachid]
if len(id_label_list2) == 0:
continue
id_list2 = list(np.array(id_label_list2)[:,0])
if id_ in id_list2:
idx = list(id_list2).index(id_)
id_label_list2.pop(idx)
for id1, id_label_list in fixed_dic.items():
if len(id_label_list) == 0:
continue
id_list = np.array(id_label_list)[:,0]
label_list = np.array(id_label_list)[:,1]
for id2, label in zip(id_list, label_list):
if label == 1:
agree_dic[id1].append(id2)
elif label == 2:
disagree_dic[id1].append(id2)
new_data = []
given_label_agree = []
given_label_dis = []
for id1, id_label_list in fixed_dic_cleaned.items():
if len(id_label_list) == 0:
continue
id2_list = np.array(id_label_list)[:,0]
label_list = np.array(id_label_list)[:,1]
for id2, label in zip(id2_list, label_list):
new_data.append((id_to_text_en[id1], id_to_text_en[id2], id_to_text_zh[id1], id_to_text_zh[id2], label))
print("fixed data length:{}, original:{}".format(len(new_data), len(id1_train)))
forecast_dic = defaultdict(list)
for id_, agree_ids in agree_dic.items():
disagree_ids = disagree_dic[id_]
for agree_id in agree_ids:
given_ids_labels= fixed_dic[agree_id]
if len(given_ids_labels) == 0:
continue
given_ids = np.array(given_ids_labels)[:, 0]
given_labels = np.array(given_ids_labels)[:, 1]
assert given_ids.shape == given_labels.shape
# new 'disagree data'
for disagree_id in disagree_ids:
if disagree_id in given_ids:
# When labels are already given
idx = list(given_ids).index(disagree_id)
label = given_labels[idx]
given_label_dis.append(label)
pass
else:
# when the label is not given explicitly
forecast_dic[agree_id].append((disagree_id, 2))
forecast_dic[disagree_id].append((agree_id, 2))
new_data.append((id_to_text_en[agree_id], id_to_text_en[disagree_id], id_to_text_zh[agree_id], id_to_text_zh[disagree_id], 2))
# new 'agree data'
for agree_id2 in agree_ids:
if agree_id == agree_id2:
continue
else:
if agree_id2 in given_ids:
# when labels are already given
idx = list(given_ids).index(agree_id2)
label = given_labels[idx]
given_label_agree.append(label)
pass
else:
pass
# when the label is not given explicitly.
forecast_dic[agree_id].append((agree_id2, 1))
forecast_dic[agree_id2].append((agree_id, 1))
# new_data.append((id_to_text_en[agree_id], id_to_text_en[agree_id2], id_to_text_zh[agree_id], id_to_text_zh[agree_id2], 1))
# c = Counter(given_label_agree)
# print("given_label_agree", c)
# c = Counter(given_label_dis)
# print("given_label_disagree", c)
print("final data length:",len(new_data))
with open(data_dir + 'save/fixed_dic.pickle', mode='wb') as f:
pickle.dump(fixed_dic, f)
with open(data_dir + 'save/given_dic.pickle', mode='wb') as f:
pickle.dump(given_dic, f)
with open(data_dir + 'save/forecast_dic.pickle', mode='wb') as f:
pickle.dump(forecast_dic, f)
return new_data, given_dic, fixed_dic, forecast_dic
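# A minimal sketch (toy ids, not competition data) of the augmentation rule used above:
# if A agrees with B and A disagrees with C, B and C are assumed to disagree (label 2);
# if A agrees with both B and C, B and C are assumed to agree (label 1).
def _augmentation_rule_example():
    agree_demo = {1: [2, 4]}     # title 1 agrees with titles 2 and 4
    disagree_demo = {1: [3]}     # title 1 disagrees with title 3
    inferred = []
    for anchor, agreed in agree_demo.items():
        for a in agreed:
            for d in disagree_demo.get(anchor, []):
                inferred.append((a, d, 2))            # inferred 'disagreed' pair
            for a2 in agreed:
                if a2 != a:
                    inferred.append((a, a2, 1))       # inferred 'agreed' pair
    return inferred  # [(2, 3, 2), (2, 4, 1), (4, 3, 2), (4, 2, 1)]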
def preprocess_():
train_df = pd.read_csv(data_dir + "train.csv")
test_df = pd.read_csv(data_dir + "test.csv")
sub = pd.read_csv(data_dir + "sample_submission.csv")
def english_clean_series(series):
# Uppercase letters ---> lowercase letters
series = series.str.lower()
def clean_seq(seq):
seq = seq.replace("it's", "it is")
seq = seq.replace("he's", "he is")
seq = seq.replace("she's", "she is")
seq = seq.replace("you're", "you are")
seq = seq.replace("we're", "we are")
seq = seq.replace("they're", "they are")
seq = seq.replace("i'm", "i am")
seq = seq.replace("don't", "do not")
seq = seq.replace("does't", "does not")
seq = seq.replace("didn't", "did not")
seq = seq.replace("aren't", "are not")
seq = seq.replace("weren't", "were not")
seq = seq.replace("isn't", "is not")
seq = seq.replace("wasn't", "was not")
seq = seq.replace("haven't", "have not")
seq = seq.replace("hasn't", "has not")
seq = seq.replace("can't", "can not")
seq = seq.replace("cannot", "can not")
seq = seq.replace("shouldn't", "should not")
seq = seq.replace("wouldn't", "would not")
seq = seq.replace("couldn't", "could not")
seq = seq.replace("mightn't", "might not")
seq = seq.replace("mustn't", "must not")
seq = seq.replace("needn't", "need not")
seq = seq.replace("won't", "will not")
seq = seq.replace("'s", "")
seq = seq.replace("\n", "")
seq = seq.replace("[", "")
seq = seq.replace("]", "")
seq = seq.replace(" the ", " ")
seq = seq.replace(" a ", " ")
seq = seq.replace(" an ", " ")
seq = seq.replace("< i >", "")
seq = seq.replace("< / i >", "")
seq = re.sub(r'[,."''“”。、#()→⇒←↓↑:;_㊙️【《》=|/+<>]+', '', seq)
seq = seq.replace(r'-', ' - ')
seq = seq.replace(r'!', ' ! ')
seq = seq.replace(r'?', ' ? ')
seq = seq.replace(r'?', ' ? ')
seq = seq.replace(r'!', ' ! ')
seq = seq.replace(r'?', ' ? ')
seq = re.sub(r'[$]+', '$ ', seq)
seq = re.sub(r'[0-9]+', '<NUM>', seq)
seq_split = seq.split(" ")
new_seq = ""
for word in seq_split:
if not word in stopwords_en:
new_seq += word
new_seq += " "
return new_seq
'''
with open('save/top_words.pickle', mode='rb') as f:
top_words = pickle.load(f)
# Leave frequent top 20000 words. Do we need them???
seq = new_seq
seq_split = seq.split(" ")
new_seq = ""
for word in seq_split:
if word in top_words:
new_seq += word
new_seq += " "
return new_seq
series = series.apply(clean_seq)
'''
return series.apply(clean_seq)
def chinese_clean_series(series):
def clean_seq(seq):
seq = str(seq)
seq = seq.replace("< i >", "")
seq = seq.replace("< / i >", "")
seq = seq.replace("\n", "")
seq = re.sub(r'[,."''“”。、#()→⇒←↓↑:;_㊙️【《》=|/<>]+', '', seq)
#seq = re.sub(r'[!!??-]+', ' ', seq)
seq = seq.replace(r'-', ' - ')
seq = seq.replace(r'!', ' ! ')
seq = seq.replace(r'?', ' ? ')
seq = seq.replace(r'?', ' ? ')
seq = seq.replace(r'!', ' ! ')
seq = seq.replace(r'?', ' ? ')
seq = re.sub(r'[$]+', '$ ', seq)
seq = re.sub(r'万', '00', seq)
seq = re.sub(r'[0-9]+', '<NUM>', seq)
return seq
series = series.apply(clean_seq)
return series
train_df["title1_en"] = english_clean_series(train_df["title1_en"])
train_df["title2_en"] = english_clean_series(train_df["title2_en"])
train_df["title1_zh"] = chinese_clean_series(train_df["title1_zh"])
train_df["title2_zh"] = chinese_clean_series(train_df["title2_zh"])
test_df["title1_en"] = english_clean_series(test_df["title1_en"])
test_df["title2_en"] = english_clean_series(test_df["title2_en"])
test_df["title1_zh"] = chinese_clean_series(test_df["title1_zh"])
test_df["title2_zh"] = chinese_clean_series(test_df["title2_zh"])
train_df.replace('unrelated', 0, inplace=True)
train_df.replace('agreed', 1, inplace=True)
train_df.replace('disagreed', 2, inplace=True)
y = list(train_df["label"])
#Create a word dictionary
train_t1_en = train_df["title1_en"]
train_t2_en = train_df["title2_en"]
test_t1_en = test_df["title1_en"]
test_t2_en = test_df["title2_en"]
train_t1_zh = train_df["title1_zh"]
train_t2_zh = train_df["title2_zh"]
test_t1_zh = test_df["title1_zh"]
test_t2_zh = test_df["title2_zh"]
label = train_df["label"]
print(train_t1_en.head())
word_to_ix_en = {}
for title1, title2 in zip(tqdm(train_t1_en), train_t2_en):
for word in title1.split():
if word not in word_to_ix_en.keys():
word_to_ix_en[word] = len(word_to_ix_en)+1
for word in title2.split():
if word not in word_to_ix_en.keys():
word_to_ix_en[word] = len(word_to_ix_en)+1
for title1, title2 in zip(tqdm(test_t1_en), test_t2_en):
for word in title1.split():
if word not in word_to_ix_en.keys():
word_to_ix_en[word] = len(word_to_ix_en)+1
for word in title2.split():
if word not in word_to_ix_en.keys():
word_to_ix_en[word] = len(word_to_ix_en)+1
#Chinese
word_to_ix_zh = {}
for title1, title2 in zip(tqdm(train_t1_zh), train_t2_zh):
for word in title1:
if word not in word_to_ix_zh.keys():
word_to_ix_zh[word] = len(word_to_ix_zh)+1
for word in title2:
if word not in word_to_ix_zh.keys():
word_to_ix_zh[word] = len(word_to_ix_zh)+1
for title1, title2 in zip(tqdm(test_t1_zh), test_t2_zh):
for word in title1:
if word not in word_to_ix_zh.keys():
word_to_ix_zh[word] = len(word_to_ix_zh)+1
for word in title2:
if word not in word_to_ix_zh.keys():
word_to_ix_zh[word] = len(word_to_ix_zh)+1
print("the number of english words:{}, chinese words:{}".format(len(word_to_ix_en), len(word_to_ix_zh)))
with open(data_dir + 'save/word_to_ix_en.pickle', mode='wb') as f:
pickle.dump(word_to_ix_en, f)
with open(data_dir + 'save/word_to_ix_zh.pickle', mode='wb') as f:
pickle.dump(word_to_ix_zh, f)
with open(data_dir + 'save/train_df.pickle', mode='wb') as f:
pickle.dump(train_df, f)
with open(data_dir + 'save/test_df.pickle', mode='wb') as f:
pickle.dump(test_df, f)
print("cleaned df, word to ix saved.")
# Note: articles that agree with A may be in a 'disagreed' relationship with articles that disagree with A.
# with open('save/word_to_ix_en.pickle', mode='rb') as f:
# word_to_ix_en = pickle.load(f)
# with open('save/word_to_ix_zh.pickle', mode='rb') as f:
# word_to_ix_zh = pickle.load(f)
# with open('save/train_df.pickle', mode='rb') as f:
# train_df = pickle.load(f)
# with open('save/test_df.pickle', mode='rb') as f:
# test_df = pickle.load(f)
#
# title1_en = list(train_df["title1_en"])
# title2_en = list(train_df["title2_en"])
# title1_zh = list(train_df["title1_zh"])
# title2_zh = list(train_df["title2_zh"])
# labels = list(train_df["label"])
#
# id1 = list(train_df["tid1"])
# id2 = list(train_df["tid2"])
#
# #id1_train, id1_val, train1_en, val1_en, train1_zh, val1_zh, id2_train, id2_val, train2_en, val2_en,train2_zh, val2_zh, y_train, y_val = train_test_split(id1, title1_en, title1_zh, id2, title2_en, title2_zh, labels, test_size=0.2, random_state=0)
# training_df, val_df = train_test_split(train_df, test_size=0.2, random_state=0)
#
#
# #new_data, _ = make_new_data(id1_train, id2_train, train1_en, train2_en, y_train)
# new_data, _, _ = make_new_data(training_df)
#
# #print(len(new_data_en))
#
# train1_en, train2_en = [],[]
# train1_zh, train2_zh = [],[]
# y_train = []
# for text1_en, text2_en, text1_zh, text2_zh,label in new_data:
# train1_en.append(text1_en)
# train2_en.append(text2_en)
# train1_zh.append(text1_zh)
# train2_zh.append(text2_zh)
# y_train.append(label)
#
# # new_data_zh, _ = make_new_data(id1_train, id2_train, train1_zh, train2_zh, y_train)
# # print(len(new_data_zh))
# # for text1, text2, label in new_data_zh:
# # train1_zh.append(text1)
# # train2_zh.append(text2)
# #
#
# val1_en, val2_en = list(val_df["title1_en"]), list(val_df["title2_en"])
# val1_zh, val2_zh = list(val_df["title1_zh"]), list(val_df["title2_zh"])
# y_val = list(val_df["label"])
#
# assert len(train1_zh)==len(train1_en) and len(y_train)==len(train1_zh)
#
#
#
# print("training data:{}, validation data:{}".format(len(y_train), len(y_val)))
return 0
# return (train1_en, val1_en, train1_zh, val1_zh, train2_en, val2_en,train2_zh, val2_zh, y_train, y_val)
if isPreprocess==True:
preprocess_()
MODELS
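Four model variants are defined below: LSTM_Classifier (two-language LSTM over title pairs), MLP_Classifier (summed bag-of-embeddings MLP), Twolang_Classifier (TextCNN over both English and Chinese titles), and Text_CNN_Classifier (single-language TextCNN).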
In [0]:
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm as tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import re
from nltk.corpus import stopwords
import nltk
class LSTM_Classifier(nn.Module):
def __init__(self, embedding_dim, hidden_dim, vocab_size_en, vocab_size_zh, target_size=3, seq_length_en=50, seq_length_zh=140):
super(LSTM_Classifier, self).__init__()
self.hidden_dim = hidden_dim
self.embedding_dim = embedding_dim
self.word_embeddings_en = nn.Embedding(vocab_size_en+1, embedding_dim, padding_idx=0)
self.word_embeddings_zh = nn.Embedding(vocab_size_zh+1, embedding_dim, padding_idx=0)
# The LSTM takes word embeddings as inputs, and outputs hidden states
# with dimensionality hidden_dim.
self.lstm_en = nn.LSTM(embedding_dim, hidden_dim, batch_first=False, num_layers=2)
self.lstm_zh = nn.LSTM(embedding_dim, hidden_dim, batch_first=False, num_layers=2)
# The linear layer that maps from hidden state space to tag space
self.fc1 = nn.Linear(hidden_dim*2, hidden_dim*2)
self.fc1_drop = nn.Dropout(p=0.5, inplace=False)
self.fc2 = nn.Linear(hidden_dim*2, target_size)
self.initial_hidden = self.init_hidden()
self.seq_length_en=seq_length_en
self.seq_length_zh=seq_length_zh
def init_hidden(self):
# Before we've done anything, we don't have any hidden state.
# Refer to the PyTorch documentation to see exactly
# why they have this dimensionality.
# The axes semantics are (num_layers, minibatch_size, hidden_dim)
return (torch.zeros(1, 1, self.hidden_dim),
torch.zeros(1, 1, self.hidden_dim))
def forward(self, title1_en, title2_en, title1_zh, title2_zh):
batch = title1_en.shape[0]
embeds1_en = self.word_embeddings_en(title1_en)
embeds2_en = self.word_embeddings_en(title2_en)
embeds1_zh = self.word_embeddings_zh(title1_zh)
embeds2_zh = self.word_embeddings_zh(title2_zh)
# seq_length * batch * feature_dims
embeds1_en = embeds1_en.view(self.seq_length_en, batch, self.embedding_dim)
embeds2_en = embeds2_en.view(self.seq_length_en, batch, self.embedding_dim)
embeds1_zh = embeds1_zh.view(self.seq_length_zh, batch, self.embedding_dim)
embeds2_zh = embeds2_zh.view(self.seq_length_zh, batch, self.embedding_dim)
#print("embeds1_en", embeds1_en.size())
lstm_out1_en, self.hidden = self.lstm_en(embeds1_en)#, self.initial_hidden)
lstm_out2_en, self.hidden = self.lstm_en(embeds2_en)
lstm_out1_zh, self.hidden = self.lstm_zh(embeds1_zh)
lstm_out2_zh, self.hidden = self.lstm_zh(embeds2_zh)
en_sum = lstm_out1_en[-1] + lstm_out2_en[-1]
zh_sum = lstm_out1_zh[-1] + lstm_out2_zh[-1]
#print("embedding size:",en_sum.size(), zh_sum.size())
concat = torch.cat((en_sum, zh_sum), dim=1)
#print("lstm out:", lstm_out1[-1].size())
#print("concat:", concat.size())
fc1 = self.fc1_drop(F.relu(self.fc1(concat)))
fc2 = self.fc2(fc1)
return fc2
class MLP_Classifier(nn.Module):
def __init__(self, embedding_dim, vocab_size, target_size=3, seq_length=50):
super(MLP_Classifier, self).__init__()
self.embedding_dim = embedding_dim
self.word_embeddings = nn.Embedding(vocab_size+1, embedding_dim, padding_idx=0)
# The linear layer that maps from hidden state space to tag space
self.fc1 = nn.Linear(embedding_dim*2, embedding_dim*2)
self.fc1_bn = nn.BatchNorm1d(embedding_dim*2)
self.fc1_drop = nn.Dropout(p=0.5, inplace=False)
self.fc2 = nn.Linear(embedding_dim*2, target_size)
self.seq_length=seq_length
def forward(self, sentence1, sentence2):
embeds1 = self.word_embeddings(sentence1)
embeds1 = torch.sum(embeds1, 1)
#print("embed", embeds1.size())
embeds2 = self.word_embeddings(sentence2)
embeds2 = torch.sum(embeds2, 1)
#print("embedding size:",embeds1.size(), len(sentence1))
#embeds1 = embeds1.view(self.seq_length, len(sentence1), self.embedding_dim)
#embeds2 = embeds2.view(self.seq_length, len(sentence1), self.embedding_dim)
concat = torch.cat((embeds1, embeds2), dim=1)
#print("concat:", concat.size())
fc1 = self.fc1_drop(F.relu(self.fc1_bn(self.fc1(concat))))
fc2 = self.fc2(fc1)
return fc2
#Combine English and Chinese.
class Twolang_Classifier(nn.Module):
def __init__(self, embedding_dim, vocab_size_en, vocab_size_zh, target_size=3, seq_length_en=50, seq_length_zh=100, kernel_num=64):
super(Twolang_Classifier, self).__init__()
self.embedding_dim = embedding_dim
self.seq_length_en=seq_length_en
self.seq_length_zh=seq_length_zh
self.word_embeddings_en = nn.Embedding(vocab_size_en+1, embedding_dim, padding_idx=0)
self.word_embeddings_zh = nn.Embedding(vocab_size_zh+1, embedding_dim, padding_idx=0)
self.kernel_num=kernel_num
self.conv2_en = nn.Conv2d(1, kernel_num, (2, embedding_dim))
self.conv3_en = nn.Conv2d(1, kernel_num, (3, embedding_dim))
self.conv4_en = nn.Conv2d(1, kernel_num, (4, embedding_dim))
self.conv2 = nn.Conv2d(1, kernel_num, (2, embedding_dim))
self.conv3 = nn.Conv2d(1, kernel_num, (3, embedding_dim))
self.conv4 = nn.Conv2d(1, kernel_num, (4, embedding_dim))
#self.conv5 = nn.Conv2d(1, kernel_num, (5, embedding_dim))
self.Max2_pool_en = nn.MaxPool2d((self.seq_length_en-2+1, 1))
self.Max3_pool_en = nn.MaxPool2d((self.seq_length_en-3+1, 1))
self.Max4_pool_en = nn.MaxPool2d((self.seq_length_en-4+1, 1))
#self.Max5_pool = nn.MaxPool2d((self.seq_length-5+1, 1))
self.Max2_pool = nn.MaxPool2d((self.seq_length_zh-2+1, 1))
self.Max3_pool = nn.MaxPool2d((self.seq_length_zh-3+1, 1))
self.Max4_pool = nn.MaxPool2d((self.seq_length_zh-4+1, 1))
# The linear layer that maps from hidden state space to tag space
#self.fc1 = nn.Linear(embedding_dim*4, embedding_dim*4)
#self.fc1_bn = nn.BatchNorm1d(embedding_dim*4)
# self.fc1 = nn.Linear(embedding_dim+kernel_num*3, embedding_dim+kernel_num*3)
self.fc1 = nn.Linear(kernel_num*6, kernel_num*6)
self.fc1_bn = nn.BatchNorm1d(kernel_num*6)
self.fc1_drop = nn.Dropout(p=0.5, inplace=False)
self.fc2 = nn.Linear(kernel_num*6, target_size)
def forward(self, title1_en, title2_en, title1_zh, title2_zh):
batch = title1_en.shape[0]
embeds1_en = self.word_embeddings_en(title1_en)
#embeds1_en = torch.sum(embeds1_en, 1)
embeds1_en = embeds1_en.view(batch, 1, self.seq_length_en, self.embedding_dim)
embeds2_en = self.word_embeddings_en(title2_en)
#embeds2_en = torch.sum(embeds2_en, 1)
embeds2_en = embeds2_en.view(batch, 1, self.seq_length_en, self.embedding_dim)
#Convolution
embeds1_x2 = F.relu(self.conv2_en(embeds1_en))
embeds1_x3 = F.relu(self.conv3_en(embeds1_en))
embeds1_x4 = F.relu(self.conv4_en(embeds1_en))
#embeds1_x5 = F.relu(self.conv5(embeds1_zh))
embeds2_x2 = F.relu(self.conv2_en(embeds2_en))
embeds2_x3 = F.relu(self.conv3_en(embeds2_en))
embeds2_x4 = F.relu(self.conv4_en(embeds2_en))
#embeds2_x5 = F.relu(self.conv5(embeds2_zh))
# Pooling
embeds1_x2 = self.Max2_pool_en(embeds1_x2).view(batch, -1)
embeds1_x3 = self.Max3_pool_en(embeds1_x3).view(batch, -1)
embeds1_x4 = self.Max4_pool_en(embeds1_x4).view(batch, -1)
#embeds1_x5 = self.Max5_pool(embeds1_x5).view(batch, -1)
embeds2_x2 = self.Max2_pool_en(embeds2_x2).view(batch, -1)
embeds2_x3 = self.Max3_pool_en(embeds2_x3).view(batch, -1)
embeds2_x4 = self.Max4_pool_en(embeds2_x4).view(batch, -1)
#embeds2_x5 = self.Max5_pool(embeds2_x5).view(batch, -1)
embeds1_en = torch.cat((embeds1_x2, embeds1_x3, embeds1_x4), dim=1)
embeds2_en = torch.cat((embeds2_x2, embeds2_x3, embeds2_x4), dim=1)
en_sum = embeds1_en + embeds2_en
embeds1_zh = self.word_embeddings_zh(title1_zh)
#embeds1_zh = torch.sum(embeds1_zh, 1)
#For CNN.
embeds1_zh = embeds1_zh.view(batch, 1, self.seq_length_zh, self.embedding_dim)
embeds2_zh = self.word_embeddings_zh(title2_zh)
#embeds2_zh = torch.sum(embeds2_zh, 1)
#For CNN.
embeds2_zh = embeds2_zh.view(batch, 1, self.seq_length_zh, self.embedding_dim)
#Convolution
embeds1_x2 = F.relu(self.conv2(embeds1_zh))
embeds1_x3 = F.relu(self.conv3(embeds1_zh))
embeds1_x4 = F.relu(self.conv4(embeds1_zh))
#embeds1_x5 = F.relu(self.conv5(embeds1_zh))
embeds2_x2 = F.relu(self.conv2(embeds2_zh))
embeds2_x3 = F.relu(self.conv3(embeds2_zh))
embeds2_x4 = F.relu(self.conv4(embeds2_zh))
#embeds2_x5 = F.relu(self.conv5(embeds2_zh))
# Pooling
embeds1_x2 = self.Max2_pool(embeds1_x2).view(batch, -1)
embeds1_x3 = self.Max3_pool(embeds1_x3).view(batch, -1)
embeds1_x4 = self.Max4_pool(embeds1_x4).view(batch, -1)
#embeds1_x5 = self.Max5_pool(embeds1_x5).view(batch, -1)
embeds2_x2 = self.Max2_pool(embeds2_x2).view(batch, -1)
embeds2_x3 = self.Max3_pool(embeds2_x3).view(batch, -1)
embeds2_x4 = self.Max4_pool(embeds2_x4).view(batch, -1)
#embeds2_x5 = self.Max5_pool(embeds2_x5).view(batch, -1)
embeds1_zh = torch.cat((embeds1_x2, embeds1_x3, embeds1_x4), dim=1)
embeds2_zh = torch.cat((embeds2_x2, embeds2_x3, embeds2_x4), dim=1)
zh_sum = embeds1_zh + embeds2_zh
#print("embedding size:",embeds1.size(), len(sentence1))
#embeds1 = embeds1.view(self.seq_length, len(sentence1), self.embedding_dim)
#embeds2 = embeds2.view(self.seq_length, len(sentence1), self.embedding_dim)
#concat = torch.cat((embeds1_en, embeds2_en, embeds1_zh, embeds2_zh), dim=1)
concat = torch.cat((en_sum, zh_sum), dim=1)
fc1 = self.fc1_drop(F.relu(self.fc1_bn(self.fc1(concat))))
fc2 = self.fc2(fc1)
return fc2
class Text_CNN_Classifier(nn.Module):
def __init__(self, embedding_dim, vocab_size, target_size=3, seq_length=50):
super(Text_CNN_Classifier, self).__init__()
self.embedding_dim = embedding_dim
self.word_embeddings = nn.Embedding(vocab_size+1, embedding_dim, padding_idx=0)
self.seq_length=seq_length
self.conv3_1 = nn.Conv2d(1, 1, (3, embedding_dim))
self.conv4_1 = nn.Conv2d(1, 1, (4, embedding_dim))
self.conv5_1 = nn.Conv2d(1, 3, (5, embedding_dim))
self.conv3_2 = nn.Conv2d(1, 1, (3, embedding_dim))
self.conv4_2 = nn.Conv2d(1, 1, (4, embedding_dim))
self.conv5_2 = nn.Conv2d(1, 1, (5, embedding_dim))
self.Max3_pool = nn.MaxPool2d((self.seq_length-3+1, 1))
self.Max4_pool = nn.MaxPool2d((self.seq_length-4+1, 1))
self.Max5_pool = nn.MaxPool2d((self.seq_length-5+1, 1))
# The linear layer that maps from hidden state space to tag space
self.fc1 = nn.Linear(6, target_size)
def forward(self, sentence1, sentence2):
batch = len(sentence1)
embeds1 = self.word_embeddings(sentence1)
embeds2 = self.word_embeddings(sentence2)
embeds1 = embeds1.view(len(sentence1), 1, self.seq_length, self.embedding_dim)
embeds2 = embeds2.view(len(sentence2), 1, self.seq_length, self.embedding_dim)
# Convolution
embeds1_x1 = F.relu(self.conv3_1(embeds1))
embeds1_x2 = F.relu(self.conv4_1(embeds1))
embeds1_x3 = F.relu(self.conv5_1(embeds1))
# embeds2_x1 = F.relu(self.conv3_2(embeds2))
# embeds2_x2 = F.relu(self.conv4_2(embeds2))
# embeds2_x3 = F.relu(self.conv5_2(embeds2))
embeds2_x1 = F.relu(self.conv3_1(embeds2))
embeds2_x2 = F.relu(self.conv4_1(embeds2))
embeds2_x3 = F.relu(self.conv5_1(embeds2))
# Pooling
embeds1_x1 = self.Max3_pool(embeds1_x1)
embeds1_x2 = self.Max4_pool(embeds1_x2)
embeds1_x3 = self.Max5_pool(embeds1_x3)
embeds2_x1 = self.Max3_pool(embeds2_x1)
embeds2_x2 = self.Max4_pool(embeds2_x2)
embeds2_x3 = self.Max5_pool(embeds2_x3)
#print("max pool size:", embeds2_x3.size())
concat = torch.cat((embeds1_x1, embeds1_x2, embeds1_x3, embeds2_x1, embeds2_x2, embeds2_x3), -1)
x = concat.view(batch, -1)
#print("concat:", x.size())
fc1 = self.fc1(x)
#print("fc1:", fc1.size())
return fc1
DATASET
In [0]:
import pandas as pd
import numpy as np
from collections import Counter
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
class BERTDataset(Dataset):
def __init__(self, titles1_en, titles2_en, labels, tokenizer, seq_length=100):
self.titles1_en = titles1_en
self.titles2_en = titles2_en
self.labels = labels
self.tokenizer = tokenizer
self.seq_length=seq_length
def __len__(self):
return len(self.titles1_en)
def __getitem__(self, idx):
seq_length = self.seq_length
tokenizer = self.tokenizer
title1_en = self.titles1_en[idx]
tokens_a = tokenizer.tokenize(title1_en)
#indexed_tokens_title1_en = tokenizer.convert_tokens_to_ids(tokenized_title1_en)
title2_en = self.titles2_en[idx]
tokens_b = tokenizer.tokenize(title2_en)
#indexed_tokens_title2_en = tokenizer.convert_tokens_to_ids(tokenized_title2_en)
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
_truncate_seq_pair(tokens_a, tokens_b, seq_length-3)
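# Build the standard BERT sentence-pair input: [CLS] title1 tokens [SEP] title2 tokens [SEP],
# with input_type_ids 0 for the first title and 1 for the second.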
tokens = []
input_type_ids = []
tokens.append("[CLS]")
input_type_ids.append(0)
for token in tokens_a:
tokens.append(token)
input_type_ids.append(0)
tokens.append("[SEP]")
input_type_ids.append(0)
for token in tokens_b:
tokens.append(token)
input_type_ids.append(1)
tokens.append("[SEP]")
input_type_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_mask = [1] * len(input_ids)
# Zero padding.
while len(input_ids) < seq_length:
input_ids.append(0)
input_mask.append(0)
input_type_ids.append(0)
#print("input_ids:{}, input_mask:{}, input_type_ids:{}".format(len(input_ids), len(input_mask), len(input_type_ids)))
assert len(input_ids) == seq_length
assert len(input_mask) == seq_length
assert len(input_type_ids) == seq_length
input_ids = torch.tensor(input_ids)
input_mask = torch.tensor(input_mask)
input_type_ids = torch.tensor(input_type_ids)
labels = torch.tensor(self.labels[idx], dtype=torch.long)
#
#
# tokens_tensor = torch.tensor(indexed_tokens_title1_en + indexed_tokens_title2_en)
# segments_tensor = torch.tensor(len(indexed_tokens_title1_en) * [0] + len(indexed_tokens_title2_en) * [1])
#
# assert len(tokens_tensor) == len(segments_ids)
#
# label = torch.tensor(self.labels[idx], dtype=torch.long)
#
sample = {'input_ids': input_ids, 'input_mask': input_mask,
'input_type_ids':input_type_ids, 'label': labels}
# if self.transform:
# sample = self.transform(sample, self.dic_en, self.dic_zh, self.seq_length_en, self.seq_length_zh)
return sample
# Dataset
class TitleDataset(Dataset):
"""Face Landmarks dataset."""
def __init__(self, titles1_en, titles2_en,
titles1_zh, titles2_zh, labels, dic_en=None, dic_zh=None,
transform=None, seq_length_en=50, seq_length_zh=140,
if_test=False):
self.titles1_en = titles1_en
self.titles2_en = titles2_en
self.titles1_zh = titles1_zh
self.titles2_zh = titles2_zh
self.labels = labels
self.transform = transform
self.dic_en=dic_en
self.dic_zh=dic_zh
self.seq_length_en=seq_length_en
self.seq_length_zh=seq_length_zh
self.if_test=if_test
def __len__(self):
return len(self.titles1_en)
def __getitem__(self, idx):
title1_en = self.titles1_en[idx]
title2_en = self.titles2_en[idx]
title1_zh = self.titles1_zh[idx]
title2_zh = self.titles2_zh[idx]
if self.if_test:
# dummy label
label = title1_en
else:
label = torch.tensor(self.labels[idx], dtype=torch.long)
sample = {'t1_en': title1_en, 't2_en': title2_en, 't1_zh': title1_zh, 't2_zh': title2_zh, 'label': label}
if self.transform:
sample = self.transform(sample, self.dic_en, self.dic_zh, self.seq_length_en, self.seq_length_zh)
return sample
class Toidx(object):
def __call__(self, sample, word_to_idx_en, word_to_idx_zh, max_seq_length_en, max_seq_length_zh):
def prepare_sequence(seq, to_ix, max_seq_length, language="english"):
seq = str(seq)
#zero padding and word--->ix in seq.
if language == "english":
idxs = [to_ix[w] for w in seq.split()]
elif language == "chinese":
idxs = [to_ix[w] for w in seq]
if len(idxs) > max_seq_length:
idxs = idxs[:max_seq_length]
else:
idxs += [0] * (max_seq_length - len(idxs))
return torch.tensor(idxs, dtype=torch.long)
t1_en, t2_en, t1_zh, t2_zh, label = sample['t1_en'], sample['t2_en'], sample['t1_zh'], sample['t2_zh'], sample["label"]
return {'t1_en': prepare_sequence(t1_en, word_to_idx_en, max_seq_length_en, language="english"),
't2_en': prepare_sequence(t2_en, word_to_idx_en, max_seq_length_en,language="english"),
't1_zh': prepare_sequence(t1_zh, word_to_idx_zh, max_seq_length_zh,language="chinese"),
't2_zh': prepare_sequence(t2_zh, word_to_idx_zh, max_seq_length_zh,language="chinese"),
'label': label}
TRAIN
In [0]:
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import Sampler
from sklearn.model_selection import train_test_split
import re
import os
import pickle
from nltk.corpus import stopwords
import nltk
#from preprocess import preprocess_, make_new_data
from sklearn.model_selection import KFold
def train(epoch):
model.train()
for batch_idx, sample_batch in enumerate((train_loader)):
#print("batch_idx:",batch_idx)
en_title1 = sample_batch["t1_en"].to(device)
en_title2 = sample_batch["t2_en"].to(device)
zh_title1 = sample_batch["t1_zh"].to(device)
zh_title2 = sample_batch["t2_zh"].to(device)
y = sample_batch["label"].to(device)
scheduler.step()
optimizer.zero_grad()
outputs = model(en_title1, en_title2, zh_title1, zh_title2)
loss = loss_function(outputs, y)
loss.backward()
optimizer.step()
#optimizer.zero_grad()
#outputs = model(en_title2, en_title1)
#loss = loss_function(outputs, y)
#loss.backward()
#optimizer.step()
print("epoch:{},train_loss:{:.4f}".format(epoch+1 ,loss))
#print("train data all :", (batch_idx+1)*batch)
return model
def test():
with torch.no_grad():
model.eval()
test_loss = 0
correct = 0
for batch_idx, sample_batch in enumerate(val_loader):
en_title1 = sample_batch["t1_en"].to(device)
en_title2 = sample_batch["t2_en"].to(device)
zh_title1 = sample_batch["t1_zh"].to(device)
zh_title2 = sample_batch["t2_zh"].to(device)
y = sample_batch["label"].to(device)
output = model(en_title1, en_title2, zh_title1, zh_title2)
# sum up batch loss
test_loss += weighted_loss_function(output, y).item()
# get the index of the max log-probability
pred = output.max(1, keepdim=True)[1]
correct += pred.eq(y.view_as(pred)).sum().item()
#test_loss /= len(val_loader.dataset)
test_loss /= batch_idx+1
#accuracy = 100. * correct / len(val_loader.dataset)
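# NOTE: weighted_accuracy is computed on the last validation batch only, not the full validation set.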
accuracy = weighted_accuracy(pred, y)
print('Validation set: Weighted loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'
.format(test_loss, correct, len(val_loader.dataset),
accuracy))
return test_loss, accuracy
def weighted_accuracy(pred, true):
true = true.cpu().numpy()
pred = pred.cpu().numpy()
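# Per-class weights: unrelated -> 1/16, agreed -> 1/15, disagreed -> 1/5 (the rare 'disagreed' class counts most).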
class_weight = [1/16, 1/15, 1/5]
score = 0
perfect_score = 0
for p, t in zip(pred, true):
if p == t:
if t == 0:
score += 1/16
perfect_score += 1/16
elif t == 1:
score += 1/15
perfect_score += 1/15
elif t == 2:
score += 1/5
perfect_score += 1/5
else:
if t == 0:
perfect_score += 1/16
elif t == 1:
perfect_score += 1/15
elif t == 2:
perfect_score += 1/5
#print("score:{}, ideal:{}".format(score, perfect_score))
return 100 * score/perfect_score
def save_model(model, val_accuracy, save_path=data_dir + 'model'):
# if os.path.exists(path + "*.model"):
# os.remove(path + "*.model")
name = "{}fold_mlp.model".format(fold)
PATH = os.path.join(save_path, name)
torch.save(model, PATH)
def get_lr(optimizer):
for param_group in optimizer.param_groups:
return param_group['lr']
In [0]:
'''
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
train_df = pd.read_csv(data_dir + "train.csv")
FOLDS_PATH = None
if FOLDS_PATH is None:
folds = KFold(n_splits=5, shuffle=False, random_state=42)
folds_idx = [(train_idx, val_idx)
for train_idx, val_idx in folds.split(train_df)]
with open(data_dir + 'save/5Kfolds.pkl', mode='wb') as f:
pickle.dump(folds_idx, f)
print (folds_idx)
'''
Out[0]:
In [0]:
EMBEDDING_DIM = 512
HIDDEN_DIM = 256
max_seq_en = 50
max_seq_zh = 100
EPOCH=9
batch=1024
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:",device)
# with open('save/word_to_ix_en.pickle', mode='rb') as f:
# word_to_ix_en = pickle.load(f)
# with open('save/word_to_ix_zh.pickle', mode='rb') as f:
# word_to_ix_zh = pickle.load(f)
print("@preprocessing..")
#_ = preprocess_()
# Data loading
with open(data_dir + 'save/word_to_ix_en.pickle', mode='rb') as f:
word_to_ix_en = pickle.load(f)
with open(data_dir + 'save/word_to_ix_zh.pickle', mode='rb') as f:
word_to_ix_zh = pickle.load(f)
with open(data_dir + 'save/train_df.pickle', mode='rb') as f:
train_df = pickle.load(f)
with open(data_dir + 'save/test_df.pickle', mode='rb') as f:
test_df = pickle.load(f)
train_df = train_df.sample(frac=1, random_state=0).reset_index(drop=True)
fold_num = 5
kf = KFold(n_splits=fold_num, random_state = 42)
kf.get_n_splits(train_df)
train_data_list = []
val_data_list = []
'''
for train_index, val_index in kf.split(train_df):
training_df = train_df.iloc[train_index]
val_df = train_df.iloc[val_index]
new_data, _, _, _ = make_new_data(training_df)
train1_en, train2_en = [],[]
train1_zh, train2_zh = [],[]
y_train = []
for text1_en, text2_en, text1_zh, text2_zh,label in new_data:
train1_en.append(text1_en)
train2_en.append(text2_en)
train1_zh.append(text1_zh)
train2_zh.append(text2_zh)
y_train.append(label)
val1_en, val2_en = list(val_df["title1_en"]), list(val_df["title2_en"])
val1_zh, val2_zh = list(val_df["title1_zh"]), list(val_df["title2_zh"])
y_val = list(val_df["label"])
train_data_list.append((train1_en,train2_en,train1_zh,train2_zh,y_train))
val_data_list.append((val1_en, val2_en,val1_zh, val2_zh,y_val))
with open(data_dir + 'save/kfold_train_data.pickle', mode='wb') as f:
pickle.dump(train_data_list, f)
with open(data_dir + 'save/kfold_val_data.pickle', mode='wb') as f:
pickle.dump(val_data_list, f)
'''
with open(data_dir + 'save/kfold_train_data.pickle', mode='rb') as f:
train_data_list = pickle.load(f)
with open(data_dir + 'save/kfold_val_data.pickle', mode='rb') as f:
val_data_list = pickle.load(f)
PATH = data_dir+ "model/MLP.model"
PATH_list = [data_dir + "model/{}fold_mlp.model".format(fold) for fold in range(1,6,1)]
folds_accuracies = []
Pretrained = False
fold=1
for train_fold, val_fold in zip(train_data_list,val_data_list):
print("{}/{} fold :".format(fold, fold_num))
print("train length:{}, val length:{}".format(len(train_fold[0]), len(val_fold[0])))
(train1_en,train2_en,train1_zh,train2_zh,y_train) = train_fold
(val1_en, val2_en,val1_zh, val2_zh,y_val) = val_fold
# Class weights can be computed as: n_samples / (n_classes * np.bincount(y))
c = Counter(y_train)
class_weight = []
for label, num in sorted(c.items()):
print(label, num)
class_weight.append(len(y_train)/(3*num))
#class_weight = torch.FloatTensor(class_weight).to(device)
#print("class weight:", class_weight)
model = LSTM_Classifier(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix_en), len(word_to_ix_zh), target_size=3, seq_length_en=max_seq_en,seq_length_zh=max_seq_zh)
#model = MLP_Classifier(EMBEDDING_DIM, len(word_to_ix_en), target_size=3, seq_length=max_seq_en)
#model = Text_CNN_Classifier(EMBEDDING_DIM, len(word_to_ix_en), target_size=3, seq_length=max_seq_length)
#model = Twolang_Classifier(EMBEDDING_DIM, len(word_to_ix_en),len(word_to_ix_zh), target_size=3, kernel_num=64)
model.to(device)
train_dataset = TitleDataset(train1_en, train2_en, train1_zh, train2_zh, y_train,
dic_en=word_to_ix_en, dic_zh=word_to_ix_zh, transform=Toidx(),
seq_length_en=max_seq_en, seq_length_zh=max_seq_zh)
val_dataset = TitleDataset(val1_en, val2_en, val1_zh, val2_zh, y_val,
dic_en=word_to_ix_en, dic_zh=word_to_ix_zh, transform=Toidx(),
seq_length_en=max_seq_en, seq_length_zh=max_seq_zh)
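# Oversample rare classes with a WeightedRandomSampler (inverse class frequency) so each
# mini-batch is roughly class-balanced.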
class_sample_count = np.array([len(np.where(y_train == t)[0]) for t in np.unique(y_train)])
weight = 1. / class_sample_count
samples_weight = np.array([weight[t] for t in y_train])
samples_weight = torch.from_numpy(samples_weight)
samples_weight = samples_weight.double()
sampler = torch.utils.data.sampler.WeightedRandomSampler(samples_weight, len(samples_weight))
train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=False, sampler=sampler)#, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=512, shuffle=False)
loss_function = nn.CrossEntropyLoss()#weight=class_weight)
weighted_loss_function = nn.CrossEntropyLoss()#weight=class_weight)
#optimizer = optim.SGD(model.parameters(), lr=0.001)
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = CosineAnnealingLR(optimizer, len(train_loader), eta_min = 0.001/10)
lowest_loss = 1000000000
highest_accuracy = 0
for epoch in range(EPOCH):
#print(epoch+1)
if Pretrained == True and highest_accuracy == 0:
name = "{}fold_mlp.model".format(fold)
PATH = os.path.join(data_dir + 'model', name)
model = torch.load(PATH)
print ('Pretrained model loaded')
model = train(epoch)
val_loss, accuracy = test()
# if val_loss < lowest_loss:
# lowest_loss = val_loss
# save_model(model)
if accuracy > highest_accuracy:
#print("saving model...")
highest_accuracy = accuracy
save_model(model, highest_accuracy)
print("highest_accuracy:{:.2f}% \n".format(highest_accuracy), 'current lr: ', get_lr(optimizer))
folds_accuracies.append(highest_accuracy)
#break
fold +=1
print ('Final mean accuracy: ', np.mean(folds_accuracies))
In [0]:
torch.cuda.is_available()
TEST
In [0]:
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm as tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import Sampler
import re
import os
#from model import *
#from dataset import TitleDataset, Toidx
#from preprocess import preprocess_, make_new_data
import pickle
from collections import defaultdict
# _ = preprocess_()
with open(data_dir + 'save/word_to_ix_en.pickle', mode='rb') as f:
word_to_ix_en = pickle.load(f)
with open(data_dir + 'save/word_to_ix_zh.pickle', mode='rb') as f:
word_to_ix_zh = pickle.load(f)
with open(data_dir + 'save/train_df.pickle', mode='rb') as f:
train_df = pickle.load(f)
with open(data_dir + 'save/test_df.pickle', mode='rb') as f:
test_df = pickle.load(f)
#_,given_dic,fixed_dic,forecast_dic = make_new_data(train_df)
with open(data_dir + 'save/fixed_dic.pickle', mode='rb') as f:
fixed_dic = pickle.load(f)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
EMBEDDING_DIM = 512
HIDDEN_DIM = 128
max_seq_en = 50
max_seq_zh = 100
model = LSTM_Classifier(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix_en), len(word_to_ix_zh), target_size=3, seq_length_en=max_seq_en, seq_length_zh=max_seq_zh)
#model = MLP_Classifier(EMBEDDING_DIM, len(word_to_ix), target_size=3, seq_length=max_seq_length)
#model = Twolang_Classifier(EMBEDDING_DIM, len(word_to_ix_en),len(word_to_ix_zh), target_size=3)
title1_en_test = list(test_df["title1_en"])
title2_en_test = list(test_df["title2_en"])
title1_zh_test = list(test_df["title1_zh"])
title2_zh_test = list(test_df["title2_zh"])
test_tid1 = list(test_df["tid1"])
test_tid2 = list(test_df["tid2"])
id_ = test_df["id"]
preded_id_label = []
given, not_given = 0, 0
agree_dic = defaultdict(list)
disagree_dic = defaultdict(list)
for id1, id_label_list in fixed_dic.items():
if len(id_label_list) == 0:
continue
id_list = np.array(id_label_list)[:,0]
label_list = np.array(id_label_list)[:,1]
for id2, label in zip(id_list, label_list):
if label == 1:
agree_dic[id1].append(id2)
elif label == 2:
disagree_dic[id1].append(id2)
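# Propagate agree/disagree relations transitively over the test-id graph until no new edges appear:
# agree(a,b) & disagree(b,c) => disagree(a,c); agree(a,b) & agree(b,c) => agree(a,c).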
change=0
while True:
for tid1, agree_id_list in agree_dic.items():
for tid2 in agree_id_list:
disagree_to_tid2 = disagree_dic[tid2]
for dis in disagree_to_tid2:
if not dis in disagree_dic[tid1]:
disagree_dic[tid1].append(dis)
change+=1
if not tid1 in disagree_dic[dis]:
disagree_dic[dis].append(tid1)
change+=1
agree_to_tid2 = agree_dic[tid2]
for dis in agree_to_tid2:
if not dis in agree_dic[tid1]:
agree_dic[tid1].append(dis)
change+=1
if not tid1 in agree_dic[dis]:
agree_dic[dis].append(tid1)
change+=1
for tid1, disagree_id_list in disagree_dic.items():
for tid2 in disagree_id_list:
agree_to_tid2 = agree_dic[tid2]
for dis in agree_to_tid2:
if not dis in disagree_dic[tid1]:
disagree_dic[tid1].append(dis)
change+=1
if not tid1 in disagree_dic[dis]:
disagree_dic[dis].append(tid1)
change+=1
print("change number: ", change)
if change == 0:
break
else:
change = 0
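# mujun (矛盾) counts contradictions: pairs whose propagated relation is not symmetric.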
mujun = 0
for id1, id2, each_id in zip(test_tid1, test_tid2, id_):
if id2 in disagree_dic[id1]:
#check
if id1 in disagree_dic[id2]:
preded_id_label.append((each_id, 2))
else:
mujun+=1
elif id2 in agree_dic[id1]:
#check
if id1 in agree_dic[id2]:
preded_id_label.append((each_id, 1))
else:
mujun+=1
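# NOTE: resetting preded_id_label on the next line discards the rule-based labels collected above,
# so only the averaged model predictions are used below.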
preded_id_label = []
print("What could be predicted:{}, Contradiction:{}, total:{}".format(len(preded_id_label), mujun, len(test_df)))
#
# for id1, id2, each_id in zip(test_tid1, test_tid2, id_):
# if not id1 in forecast_dic.keys():
# #print("label cannot be predicted")
# not_given+=1
# pass
# else:
# forecast_data_label = np.array(forecast_dic[id1])
# if len(forecast_data_label) == 0:
# continue
#
# forecast_id = forecast_data_label[:,0]
# forecast_label = forecast_data_label[:,1]
#
# if id2 in forecast_id:
# idx = list(forecast_id).index(id2)
# label = forecast_label[idx]
# given+=1
# # preded_id_label.append((each_id, label))
# else:
# #print("label not given")
# not_given+=1
# pass
# print("予測可能セット:{}, わからないセット:{}".format(given, not_given))
PATH = data_dir+ "model/MLP.model"
PATH_list = [data_dir + "model/{}fold_mlp.model".format(fold) for fold in range(1,6,1)]
average_prediction = []
for PATH in PATH_list:
model = torch.load(PATH)
print("model loaded:{}".format(PATH))
# test dataset. label is None.
test_dataset = TitleDataset(title1_en_test, title2_en_test, title1_zh_test, title2_zh_test, None,
dic_en=word_to_ix_en, dic_zh=word_to_ix_zh, transform=Toidx(),
seq_length_en=max_seq_en, seq_length_zh=max_seq_zh, if_test=True)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False)
with torch.no_grad():
model.eval()
predictions = []
for batch_idx, sample_batch in enumerate(tqdm(test_loader)):
en_title1 = sample_batch["t1_en"].to(device)
en_title2 = sample_batch["t2_en"].to(device)
zh_title1 = sample_batch["t1_zh"].to(device)
zh_title2 = sample_batch["t2_zh"].to(device)
output = model(en_title1, en_title2, zh_title1, zh_title2)
# pred = output.max(1, keepdim=True)[1].cpu()
#print("model out :",output.size())
#predictions.extend(list(pred.numpy()))
output = output.cpu().numpy()
#print("model out:",output.shape)
if batch_idx == 0:
predictions = output
else:
predictions = np.vstack((predictions, output))
average_prediction.append(predictions)
average_prediction = np.array(average_prediction)
# print("total pred:", average_prediction.shape)
average_prediction = np.mean(average_prediction, axis=0)
# print("total pred:", average_prediction.shape)
predictions = np.argmax(average_prediction, axis=1)
print("predictions:", predictions.shape)
#'unrelated', 0
#'agreed', 1
#'disagreed', 2
if len(preded_id_label) == 0:
preded_labels = []
preded_id = []
else:
preded_id = np.array(preded_id_label)[:, 0]
preded_labels = np.array(preded_id_label)[:, 1]
print("directly preded label:", len(preded_id))
fixed_predictions = []
for each_id, p in zip(id_, predictions):
if each_id in preded_id:
idx = list(preded_id).index(each_id)
fixed_predictions.append(preded_labels[idx])
else:
fixed_predictions.append(p)
new_predictions = []
for p in fixed_predictions:
if p == 0:
new_predictions.append("unrelated")
elif p==1:
new_predictions.append("agreed")
elif p==2:
new_predictions.append("disagreed")
#
# c = Counter(list(predictions))
# print("original",c)
#
# c = Counter(fixed_predictions)
# print("fixed", c)
submit_csv = pd.concat([id_, pd.Series(new_predictions)], axis=1)
#display(submit_csv)
submit_csv.columns = ["Id", "Category"]
submit_csv.to_csv(data_dir + "submit.csv", header=True, index=False)
In [0]:
submit_csv.to_csv(data_dir + "submit.csv", header=True, index=False)
submit = pd.read_csv(data_dir + "submit.csv")
In [0]:
import requests
from bs4 import BeautifulSoup
search = "1000人犯罪团伙来德州偷孩子取器官,男子散播“1000人来德州偷孩子挖器官”谣言"
r = requests.get("https://www.google.com/search", params={'q':search})
soup = BeautifulSoup(r.text, "html.parser")
res = soup.find("div", {"id": "resultStats"})
print (res.text)
TRAIN CHINESE BERT
In [0]:
!pip install pytorch_pretrained_bert
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm as tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from sklearn.model_selection import train_test_split
import re
import os
from nltk.corpus import stopwords
import nltk
#nltk.download('stopwords')
import copy
# from model import BERT_Classifier
#from dataset import *
from collections import defaultdict
from sklearn.model_selection import KFold
import requests
class BERT_Classifier(nn.Module):
def __init__(self, bert_model, target_size=3):
super(BERT_Classifier, self).__init__()
self.embedding_dim=768
kernel_num=256
self.seq_length_en=100
self.bert_model = bert_model
# self.conv2_en = nn.Conv2d(1, kernel_num, (2, self.embedding_dim))
# self.conv3_en = nn.Conv2d(1, kernel_num, (3, self.embedding_dim))
# self.conv4_en = nn.Conv2d(1, kernel_num, (4, self.embedding_dim))
# self.Max2_pool_en = nn.MaxPool2d((self.seq_length_en-2+1, 1))
# self.Max3_pool_en = nn.MaxPool2d((self.seq_length_en-3+1, 1))
# self.Max4_pool_en = nn.MaxPool2d((self.seq_length_en-4+1, 1))
# self.fc1 = nn.Linear(kernel_num*3, 300)
# self.fc1_bn = nn.BatchNorm1d(300)
# self.fc1_drop = nn.Dropout(p=0.3, inplace=False)
# self.fc2 = nn.Linear(300, target_size)
self.fc1 = nn.Linear(768, 768)
#self.fc1_bn = nn.BatchNorm1d(300)
self.fc1_drop = nn.Dropout(p=0.3, inplace=False)
self.activation = nn.Tanh()
self.fc2 = nn.Linear(768, target_size)
def forward(self, input_ids, input_mask):
batch = len(input_ids)
last_encoder_layer, _ = self.bert_model(input_ids, token_type_ids=None, attention_mask=input_mask, output_all_encoded_layers=False)
# last_encoder_layer = last_encoder_layer.view(batch, 1, self.seq_length_en, self.embedding_dim)
#
#
# conv2 = F.relu(self.conv2_en(last_encoder_layer))
# conv3 = F.relu(self.conv3_en(last_encoder_layer))
# conv4 = F.relu(self.conv4_en(last_encoder_layer))
#
# pool2 = self.Max2_pool_en(conv2).view(batch, -1)
# pool3 = self.Max3_pool_en(conv3).view(batch, -1)
# pool4 = self.Max4_pool_en(conv4).view(batch, -1)
#print(last_encoder_layer.size())
# embedding = torch.sum(last_encoder_layer, 1)
#cat = torch.cat((pool2, pool3, pool4), dim=1)
#print("fc1", cat.size())
first_token_tensor = last_encoder_layer[:, 0]
# fc1 = self.fc1_drop(F.relu(self.fc1(first_token_tensor)))
fc1 = self.fc1_drop(self.activation(self.fc1(first_token_tensor)))
fc2 = self.fc2(fc1)
return fc2
EMBEDDING_DIM = 512
HIDDEN_DIM = 256
max_seq_en = 50
max_seq_zh = 60
EPOCH= 5
batch=64
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:",device)
train_df = pd.read_csv(data_dir + "train.csv")
# test_df = pd.read_csv("data/test.csv")
train_df.replace('unrelated', 0, inplace=True)
train_df.replace('agreed', 1, inplace=True)
train_df.replace('disagreed', 2, inplace=True)
def chinese_clean_series(series):
def clean_seq(seq):
seq = str(seq)
ori = copy.copy(seq)
seq = seq.replace("< i >", "")
seq = seq.replace("< / i >", "")
seq = seq.replace("\n", "")
seq = re.sub(r'[,."''“”。、#()→⇒←↓↑:;_㊙️【《》=|/<>]+', '', seq)
seq = re.sub(r'[!!??-]+', ' ', seq)
seq = re.sub(r'[$]+', '$ ', seq)
seq = re.sub(r'[0-9]+', '<NUM>', seq)
if len(seq)==0:
print("0 lengrh assert!!,",ori, seq)
return seq
series = series.apply(clean_seq)
return series
train_df["title1_zh"] = chinese_clean_series(train_df["title1_zh"])
train_df["title2_zh"] = chinese_clean_series(train_df["title2_zh"])
train_df = train_df.sample(frac=1, random_state=0).reset_index(drop=True)#.iloc[:300, :]
# K-Fold Cross validation
fold_num = 5
kf = KFold(n_splits=fold_num , random_state = 42)
kf.get_n_splits(train_df)
# kf.get_n_splits(X, y)
train_data_list = []
val_data_list = []
for train_index, val_index in kf.split(train_df):
#for train_index, val_index in kf.split(X):
training_df = train_df.iloc[train_index]
val_df = train_df.iloc[val_index]
train1_en, train2_en = list(training_df["title1_en"]), list(training_df["title2_en"])
train1_zh, train2_zh = list(training_df["title1_zh"]), list(training_df["title2_zh"])
y_train = list(training_df["label"])
val1_en, val2_en = list(val_df["title1_en"]), list(val_df["title2_en"])
val1_zh, val2_zh = list(val_df["title1_zh"]), list(val_df["title2_zh"])
y_val = list(val_df["label"])
train_data_list.append((train1_zh,train2_zh, y_train))#train1_zh,train2_zh,y_train))
val_data_list.append((val1_zh, val2_zh, y_val))# val1_zh, val2_zh,y_val))
#
# with open('save/kfold_train_data.pickle', mode='wb') as f:
# pickle.dump(train_data_list, f)
# with open('save/kfold_val_data.pickle', mode='wb') as f:
# pickle.dump(val_data_list, f)
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
bert_model = BertModel.from_pretrained('bert-base-chinese').to(device)
bert_model.eval()
fold=1
for train_fold, val_fold in zip(train_data_list,val_data_list):
print("{}/{} fold :".format(fold, fold_num))
print("train length:{}, val length:{}".format(len(train_fold[0]), len(val_fold[0])))
(train1_zh, train2_zh, y_train) = train_fold
(val1_zh, val2_zh, y_val) = val_fold
c = Counter(y_train)
class_weight = []
for label, num in sorted(c.items()):
print(label, num)
class_weight.append(len(y_train)/(3*num))
class_weight = torch.FloatTensor(class_weight).to(device)
model = BERT_Classifier(bert_model)
model.to(device)
loss_function = nn.CrossEntropyLoss()#weight=class_weight)
weighted_loss_function = nn.CrossEntropyLoss(weight=class_weight)#weight=class_weight)
train_dataset = BERTDataset(train1_zh, train2_zh, y_train, tokenizer, seq_length=max_seq_zh)
val_dataset = BERTDataset(val1_zh, val2_zh, y_val, tokenizer, seq_length=max_seq_zh)
class_sample_count = np.array([len(np.where(y_train == t)[0]) for t in np.unique(y_train)])
weight = 1. / class_sample_count
samples_weight = np.array([weight[t] for t in y_train])
samples_weight = torch.from_numpy(samples_weight)
samples_weight = samples_weight.double()
sampler = torch.utils.data.sampler.WeightedRandomSampler(samples_weight, len(samples_weight))
train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=False, sampler=sampler)#, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch, shuffle=False)
#optimizer = optim.SGD(model.parameters(), lr=0.001)
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = CosineAnnealingLR(optimizer, len(train_loader), eta_min = 0.001/10)
def train(epoch):
model.train()
for batch_idx, sample_batch in enumerate(tqdm(train_loader)):
input_ids = sample_batch["input_ids"].to(device)
input_mask = sample_batch["input_mask"].to(device)
input_type_ids = sample_batch["input_type_ids"].to(device)
y = sample_batch["label"].to(device)
scheduler.step()
optimizer.zero_grad()
outputs = model(input_ids, input_mask)
loss = loss_function(outputs, y)
loss.backward()
optimizer.step()
if batch_idx%100==0:
print("epoch:{},train_loss:{:.4f}".format(epoch+1 ,loss))
print("epoch:{},train_loss:{:.4f}".format(epoch+1 ,loss))
#print("train data all :", (batch_idx+1)*batch)
return model
def test():
with torch.no_grad():
model.eval()
test_loss = 0
correct = 0
for batch_idx, sample_batch in enumerate(val_loader):
input_ids = sample_batch["input_ids"].to(device)
input_mask = sample_batch["input_mask"].to(device)
input_type_ids = sample_batch["input_type_ids"].to(device)
y = sample_batch["label"].to(device)
output = model(input_ids, input_mask)
# sum up batch loss
#test_loss += weighted_loss_function(output, y).item()
test_loss += loss_function(output, y).item()
# get the index of the max log-probability
pred = output.max(1, keepdim=True)[1]
correct += pred.eq(y.view_as(pred)).sum().item()
#test_loss /= len(val_loader.dataset)
test_loss /= batch_idx+1
#accuracy = 100. * correct / len(val_loader.dataset)
accuracy = weighted_accuracy(pred, y)
print('Validation set: Weighted loss: {:.4f}, Weighted Accuracy: {}/{} ({:.2f}%)'
.format(test_loss, correct, len(val_loader.dataset),
accuracy))
return test_loss, accuracy
def weighted_accuracy(pred, true):
true = true.cpu().numpy()
pred = pred.cpu().numpy()
class_weight = [1/16, 1/15, 1/5]
score = 0
perfect_score = 0
for p, t in zip(pred, true):
if p == t:
if t == 0:
score += 1/16
perfect_score += 1/16
elif t == 1:
score += 1/15
perfect_score += 1/15
elif t == 2:
score += 1/5
perfect_score += 1/5
else:
if t == 0:
perfect_score += 1/16
elif t == 1:
perfect_score += 1/15
elif t == 2:
perfect_score += 1/5
#print("score:{}, ideal:{}".format(score, perfect_score))
return 100 * score/perfect_score
def save_model(model, val_accuracy, save_path=data_dir + 'model'):
# if os.path.exists(path + "*.model"):
# os.remove(path + "*.model")
name = "{}fold_mlp.model".format(fold)
PATH = os.path.join(save_path, name)
torch.save(model, PATH)
lowest_loss = 1000000000
highest_accuracy = 0
for epoch in range(EPOCH):
#print(epoch+1)
model = train(epoch)
val_loss, accuracy = test()
# if val_loss < lowest_loss:
# lowest_loss = val_loss
# save_model(model)
if accuracy > highest_accuracy:
#print("saving model...")
highest_accuracy = accuracy
save_model(model, highest_accuracy)
print("highest_accuracy:{:.2f}% \n".format(highest_accuracy))
fold+=1
In [0]:
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
process = psutil.Process(os.getpid())
print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()
In [0]:
!pip install pytorch_pretrained_bert
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm as tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from sklearn.model_selection import train_test_split
import re
import os
from nltk.corpus import stopwords
import nltk
#nltk.download('stopwords')
# from model import BERT_Classifier
from dataset import *
from collections import defaultdict
from sklearn.model_selection import KFold
import random
class BERT_Classifier(nn.Module):
def __init__(self,target_size=3):
super(BERT_Classifier, self).__init__()
self.fc1 = nn.Linear(768, 768)
self.fc1_bn = nn.BatchNorm1d(768)
self.fc1_drop = nn.Dropout(p=0.3, inplace=False)
self.fc2 = nn.Linear(768, target_size)
def forward(self, last_encoder_layer):#, input_ids, input_mask):
#last_encoder_layer, _ = self.bert_model(input_ids, token_type_ids=None, attention_mask=input_mask, output_all_encoded_layers=False)
#print(last_encoder_layer.size())
embedding = torch.sum(last_encoder_layer, 1)
#print("embedding", embedding.size())
fc1 = self.fc1_drop(F.relu(self.fc1_bn(self.fc1(embedding))))
fc2 = self.fc2(fc1)
return fc2
EMBEDDING_DIM = 512
HIDDEN_DIM = 256
max_seq_en = 50
max_seq_zh = 100
EPOCH=10
batch=32
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:",device)
train_df = pd.read_csv("data/train.csv")
train_df.replace('unrelated', 0, inplace=True)
train_df.replace('agreed', 1, inplace=True)
train_df.replace('disagreed', 2, inplace=True)
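# X is assumed to hold precomputed BERT encoder outputs for each title pair
# (a sequence of 768-dim token vectors per pair), saved offline to save/features.pickle.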
X = pd.read_pickle("save/features.pickle")
print("X:", X.shape)
y = list(train_df["label"])
p = list(zip(X, y))
random.shuffle(p)
X, y = zip(*p)
X = np.array(X)
y = np.array(y)
# K-Fold Cross validation
fold_num = 5
kf = KFold(n_splits=fold_num)
kf.get_n_splits(X, y)
train_data_list = []
val_data_list = []
fold=1
for train_index, val_index in kf.split(X):
X_train = X[train_index]
X_val = X[val_index]
y_train = y[train_index]
y_val = y[val_index]
print("{}/{} fold :".format(fold, fold_num))
print("train length:{}, val length:{}".format(len(X_train), len(X_val)))
c = Counter(y_train)
class_weight = []
for label, num in sorted(c.items()):
print(label, num)
class_weight.append(len(y_train)/(3*num))
class_weight = torch.FloatTensor(class_weight).to(device)
model = BERT_Classifier()
model.to(device)
loss_function = nn.CrossEntropyLoss()#weight=class_weight)
weighted_loss_function = nn.CrossEntropyLoss(weight=class_weight)#weight=class_weight)
#optimizer = optim.SGD(model.parameters(), lr=0.001)
optimizer = optim.Adam(model.parameters(), lr=0.001)
train_dataset = torch.utils.data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
val_dataset = torch.utils.data.TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))
# Balance the class ratio within each mini-batch.
class_sample_count = np.array([len(np.where(y_train == t)[0]) for t in np.unique(y_train)])
weight = 1. / class_sample_count
samples_weight = np.array([weight[t] for t in y_train])
samples_weight = torch.from_numpy(samples_weight)
samples_weight = samples_weight.double()
sampler = torch.utils.data.sampler.WeightedRandomSampler(samples_weight, len(samples_weight))
train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=False, sampler=sampler)#, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
def train(epoch):
model.train()
for batch_idx, sample_batch in enumerate(tqdm(train_loader)):
inputs, y = sample_batch
inputs = inputs.to(device)
y = y.to(device)
optimizer.zero_grad()
outputs = model(inputs)
loss = loss_function(outputs, y)
loss.backward()
optimizer.step()
print("epoch:{},train_loss:{:.4f}".format(epoch+1 ,loss))
#print("train data all :", (batch_idx+1)*batch)
return model
def test():
with torch.no_grad():
model.eval()
test_loss = 0
correct = 0
for batch_idx, sample_batch in enumerate(val_loader):
inputs, y = sample_batch
inputs = inputs.to(device)
y = y.to(device)
output = model(inputs)
# sum up batch loss
test_loss += weighted_loss_function(output, y).item()
# get the index of the max log-probability
pred = output.max(1, keepdim=True)[1]
correct += pred.eq(y.view_as(pred)).sum().item()
#test_loss /= len(val_loader.dataset)
test_loss /= batch_idx+1
#accuracy = 100. * correct / len(val_loader.dataset)
accuracy = weighted_accuracy(pred, y)
print('Validation set: Weighted loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'
.format(test_loss, correct, len(val_loader.dataset),
accuracy))
return test_loss, accuracy
def weighted_accuracy(pred, true):
true = true.cpu().numpy()
pred = pred.cpu().numpy()
class_weight = [1/16, 1/15, 1/5]
score = 0
perfect_score = 0
for p, t in zip(pred, true):
if p == t:
if t == 0:
score += 1/16
perfect_score += 1/16
elif t == 1:
score += 1/15
perfect_score += 1/15
elif t == 2:
score += 1/5
perfect_score += 1/5
else:
if t == 0:
perfect_score += 1/16
elif t == 1:
perfect_score += 1/15
elif t == 2:
perfect_score += 1/5
#print("score:{}, ideal:{}".format(score, perfect_score))
return 100 * score/perfect_score
def save_model(model, val_accuracy, save_path="model/BERT/"):
# if os.path.exists(path + "*.model"):
# os.remove(path + "*.model")
name = "{}fold_mlp.model".format(fold)
PATH = os.path.join(save_path, name)
torch.save(model, PATH)
lowest_loss = 1000000000
highest_accuracy = 0
for epoch in range(EPOCH):
#print(epoch+1)
model = train(epoch)
val_loss, accuracy = test()
# if val_loss < lowest_loss:
# lowest_loss = val_loss
# save_model(model)
if accuracy > highest_accuracy:
#print("saving model...")
highest_accuracy = accuracy
#save_model(model, highest_accuracy)
print("highest_accuracy:{:.2f}% \n".format(highest_accuracy))
fold+=1